{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Chapter 8: Customizing Functions" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This situation arises when you want to perform more complex calculations on your data than just adding or subtracting two columns." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Functions of this kind can save you from writing a for loop to iterate over a whole column, for instance." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/rockefeller/.local/lib/python3.9/site-packages/pandas/core/arrays/masked.py:64: UserWarning: Pandas requires version '1.3.2' or newer of 'bottleneck' (version '1.2.1' currently installed).\n", " from pandas.core import (\n" ] } ], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def my_function():\n", " # indent 4 spaces\n", " # function code\n", " pass" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def my_sq(x):\n", " \"\"\"squares a given value\n", " \"\"\"\n", " return x ** 2" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_sq(2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "16" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_sq(4)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def avg_2(x, y):\n", " \"\"\"calculates average between 2 numbers\n", " \"\"\"\n", " return (x + y) / 2.0" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": 
[ { "data": { "text/plain": [ "15.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "avg_2(10, 20)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df1 = pd.DataFrame({\n", " \"A\": [5,10,15],\n", " \"B\": [3,6,9]\n", " })" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
053
1106
2159
\n", "
" ], "text/plain": [ " A B\n", "0 5 3\n", "1 10 6\n", "2 15 9" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 25\n", "1 100\n", "2 225\n", "Name: A, dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1['A'].apply(my_sq)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1['A'] ** 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we want to understand how an **apply function** works on a dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def print_me(x):\n", "    \"\"\"Print the object that `apply` passes in, to show what each call receives.\"\"\"\n", "    print(x)\n", "\n", "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1.apply(print_me)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def avg_3(x, y, z):\n", "    \"\"\"Return the arithmetic mean of three numbers.\"\"\"\n", "    return (x + y + z) / 3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# avg_3 needs three separate arguments, but apply passes it one whole column,\n", "# so this call fails. Catch the error so the notebook still runs end to end.\n", "try:\n", "    df1.apply(avg_3)\n", "except TypeError as err:\n", "    print(f\"TypeError: {err}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def Avg_3(col):\n", "    \"\"\"Return the mean of the first three values of a column.\n", "\n", "    `col` is the Series that `DataFrame.apply` passes in for one column.\n", "    Values are read by position with `.iloc`, so the result does not\n", "    depend on the index labels.\n", "    \"\"\"\n", "    x = col.iloc[0]\n", "    y = col.iloc[1]\n", "    z = col.iloc[2]\n", "    return (x + y + z) / 3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1.apply(Avg_3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df1.apply(Avg_3, axis=0)" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "def avg_2(row):\n", "    \"\"\"Return the mean of the first two values of a row.\n", "\n", "    With `axis=1`, `apply` passes each row as a Series indexed by the\n", "    column names ('A', 'B'). The values are therefore read by position\n", "    with `.iloc`; using integer keys such as `row[0]` as positions on a\n", "    non-integer index is deprecated in pandas.\n", "    Note: this replaces the two-argument `avg_2(x, y)` defined earlier.\n", "    \"\"\"\n", "    x = row.iloc[0]\n", "    y = row.iloc[1]\n", "    return (x + y) / 2\n", "\n", "df1.apply(avg_2, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns\n", "\n", "titanic = sns.load_dataset('titanic')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "titanic.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "titanic.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Write a function which is going to return a count of missing values column-wise" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def count_missing(vec):\n", "    \"\"\"Return the number of missing values in `vec`.\"\"\"\n", "    null_vec = pd.isnull(vec)\n", "    null_count = np.sum(null_vec)\n", "    return null_count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "titanic.apply(count_missing)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "titanic.apply(count_missing, axis=1).value_counts()  # 549 rows have 1 missing value, 182 rows have 0 missing values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " 2. 
Write a function which is going to return a proportion of the missing values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# A proportion is just the part I took divided by the size of the whole thing" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def prop_missing(vec):\n", "    \"\"\"Return the fraction of values in `vec` that are missing.\n", "\n", "    Computed as the missing-value count from `count_missing` divided by\n", "    the total number of elements (`vec.size`).\n", "    \"\"\"\n", "    num = count_missing(vec)\n", "    den = vec.size\n", "    return num / den" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "titanic.apply(prop_missing)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " 3. Write a function which computes the remaining proportion" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def prop_complete(vec):\n", "    \"\"\"Return the fraction of values in `vec` that are not missing.\n", "\n", "    This is the complement of `prop_missing`: 1 - prop_missing(vec).\n", "    \"\"\"\n", "    return 1 - prop_missing(vec)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "titanic.apply(prop_complete)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#titanic.loc[pd.isnull(titanic['embark_town']), :]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 2 }